# Presentation: https://prezi.com/view/zYGal9DWEFulQ9FjeTOM/
library(tm)
## Loading required package: NLP
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::annotate() masks NLP::annotate()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Load the pre-processed Billboard song dataset (lyrics plus spaCy-derived
# Verbs/Nouns/Adverbs/Corpus columns produced by the Python pipeline below).
songs <- read.csv("prepped_data.csv")
# NOTE(review): `songs2013` is not defined anywhere in this file — presumably
# created in an earlier session or chunk. Confirm before running top-to-bottom.
songs <- rbind(songs, songs2013)
# Song-structure labels to strip from the text columns. The trailing spaces
# appear intentional (so the label plus its following space is removed);
# confirm against removeWords' word-boundary matching.
stopWords <- c("chorus ", "verse ", "intro ", "1 ", "bridge ", "2 ", "Chorus ", "Intro ", "Verse ", "Bridge ")
# Keep only the analysis columns, coerce factor columns to character,
# strip section labels from Nouns/Corpus, and bucket Year into Decade.
songs <- songs %>%
select(Album, Artist, Lyrics, Rank, Song.Title, Year, Verbs, Nouns, Adverbs, Word.Counts, Corpus) %>%
mutate(Lyrics = as.character(Lyrics)) %>%
mutate(Verbs = as.character(Verbs)) %>%
mutate(Nouns = as.character(Nouns)) %>%
mutate(Adverbs = as.character(Adverbs)) %>%
mutate(Nouns = removeWords(Nouns, stopWords)) %>%
# Nested ifelse maps each Year to its decade start (1970..2010).
mutate(Decade = ifelse(Year < 1980, 1970, ifelse(Year < 1990, 1980, ifelse(Year < 2000, 1990, ifelse(Year < 2010, 2000, 2010))))) %>%
# Corpus is lower-cased before removal; note the capitalized stopWords
# entries can no longer match here.
mutate(Corpus = removeWords(tolower(as.character(Corpus)), stopWords))
library(tidyverse)
library(stringr)
library(wordcloud2)
library(tidytext)
library(Rcpp)
# Tokenize every song's Nouns field into one word per row.
songtoken <- songs %>%
  select(Nouns) %>%
  unnest_tokens(output = word, input = Nouns)

# Frequency table of nouns across the whole dataset, most common first.
song_tidy <- songtoken %>%
  count(word, sort = TRUE)

# Word cloud of the 100 most frequent nouns.
wordcloud2(data = song_tidy[1:100, ], size = 1, color = "random-light")
# Per-decade subsets for the decade-level analyses below.
songs70 <- songs %>%
  filter(Decade == 1970)
songs80 <- songs %>%
  filter(Decade == 1980)
songs90 <- songs %>%
  filter(Decade == 1990)
songs00 <- songs %>%
  filter(Decade == 2000)
songs10 <- songs %>%
  filter(Decade == 2010)
# Years covered by the dataset.
yearlist <- 1970:2019
# BUG FIX: the original loop grew top10 with c(top10, song_tidy[i, 1]),
# which yields a list of one-row tibbles rather than a character vector,
# making the later word == top10[k] comparisons unreliable. Extract the
# ten most frequent words directly as characters instead.
top10 <- head(song_tidy$word, 10)
# Yearly counts of the overall top-10 nouns, accumulated across all years.
# Preallocate a list and bind once at the end instead of the original
# incremental merge(all = TRUE) (which amounted to a row union here,
# since (word, Year) pairs are unique within each year).
yearly_counts <- vector("list", length(yearlist))
for (idx in seq_along(yearlist)) {
  yr <- yearlist[idx]
  yearly_counts[[idx]] <- songs %>%
    filter(Year == yr) %>%
    select(Nouns) %>%
    unnest_tokens(output = word, input = Nouns) %>%
    # %in% replaces the original 10-way `==`/`|` chain. unlist() tolerates
    # top10 being either a character vector or the list-of-tibbles the
    # original construction produced.
    filter(word %in% unlist(top10)) %>%
    group_by(word) %>%
    count() %>%
    arrange(desc(n)) %>%
    mutate(Year = yr)
}
masterwordlist <- bind_rows(yearly_counts)
# The same tokenize/count/cloud pipeline was repeated verbatim for all five
# decades; factored into two helpers.

# Helper: one row per (song, noun) for a decade subset.
tokenize_nouns <- function(df) {
  df %>%
    select(Song.Title, Nouns) %>%
    unnest_tokens(output = word, input = Nouns)
}

# Helper: word-frequency table, most frequent first.
count_nouns <- function(tokens) {
  tokens %>%
    group_by(word) %>%
    count() %>%
    arrange(desc(n))
}

songtoken70 <- tokenize_nouns(songs70)
song_tidy70 <- count_nouns(songtoken70)
wordcloud2(data = song_tidy70[1:100, ], size = 1, color = "random-light")

songtoken80 <- tokenize_nouns(songs80)
song_tidy80 <- count_nouns(songtoken80)
wordcloud2(data = song_tidy80[1:100, ], size = 1, color = "random-light")

songtoken90 <- tokenize_nouns(songs90)
song_tidy90 <- count_nouns(songtoken90)
wordcloud2(data = song_tidy90[1:100, ], size = 1, color = "random-light")

songtoken00 <- tokenize_nouns(songs00)
song_tidy00 <- count_nouns(songtoken00)
wordcloud2(data = song_tidy00[1:100, ], size = 1, color = "random-light")

songtoken10 <- tokenize_nouns(songs10)
song_tidy10 <- count_nouns(songtoken10)
wordcloud2(data = song_tidy10[1:100, ], size = 1, color = "random-light")

# Usage trend of the overall top-10 nouns over time.
ggplot(masterwordlist, aes(x = Year, y = n, color = word)) + geom_line()
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(dplyr)
library(tidyverse)
# Heatmap of top-10 word usage by year: words as rows, years as columns.
arrangedmasterwordlist <- masterwordlist %>%
  group_by(word) %>%
  arrange(Year)
# Wide layout: one row per word, one count column per year. data.frame()
# keeps the year-name mangling (e.g. X1970) the heatmap labels rely on.
widemasterwordlist <- data.frame(
  pivot_wider(arrangedmasterwordlist, id_cols = word,
              names_from = Year, values_from = n)
)
row.names(widemasterwordlist) <- widemasterwordlist$word
# Drop the word column and coerce to a numeric matrix for heatmap().
masterwordmatrix <- data.matrix(widemasterwordlist[, -1])
wordheatmap <- heatmap(masterwordmatrix, Rowv = NA, Colv = NA,
                       col = heat.colors(256), scale = "column",
                       margins = c(5, 10))
library(dplyr) #Data manipulation (also included in the tidyverse package)
library(tidyr) #Spread, separate, unite, text mining (also included in the tidyverse package)
library(widyr) #Use for pairwise correlation
#Visualizations!
library(ggplot2) #Visualizations (also included in the tidyverse package)
library(ggrepel) #`geom_label_repel`
library(gridExtra) #`grid.arrange()` for multi-graphs
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(knitr) #Create nicely formatted output tables
library(kableExtra) #Create nicely formatted output tables
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(formattable) #For the color_tile function
library(circlize) #Visualizations - chord diagram
## ========================================
## circlize version 0.4.8
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: http://jokergoo.github.io/circlize_book/book/
##
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization
## in R. Bioinformatics 2014.
## ========================================
# library(remotes)
# install_github("EmilHvitfeldt/textdata")
# install_github("juliasilge/tidytext")
library(tidytext) #Text mining
library(textdata)
library(textdata)
library(tidytext)
library(radarchart)
# Fetch each sentiment lexicon exactly once (the original fetched all
# three twice, before and after the library() calls).
afinn <- get_sentiments("afinn")
bing <- get_sentiments("bing")
nrc <- get_sentiments("nrc")
# The NRC profile + radar chart pipeline was repeated verbatim for all five
# decades; factored into two helpers.

# Helper: NRC emotion percentages for a word-frequency table. Joins the
# NRC lexicon (explicit by = "word", silencing the join message), drops
# the positive/negative polarity rows, and converts counts to percentages.
nrc_profile <- function(tidy_counts) {
  tidy_counts %>%
    inner_join(nrc, by = "word") %>%
    filter(!sentiment %in% c("negative", "positive")) %>%
    group_by(sentiment) %>%
    summarise(Sent_Count = sum(n)) %>%
    mutate(Sentiment = (Sent_Count / sum(Sent_Count)) * 100)
}

# Helper: radar chart of an emotion profile.
nrc_radar <- function(profile, title) {
  profile %>%
    select(-Sent_Count) %>%
    chartJSRadar(showToolTipLabel = TRUE, main = title)
}

song70_nrc <- nrc_profile(song_tidy70)
radar_chart_70s <- nrc_radar(song70_nrc, "1970s Sentiment")
radar_chart_70s

song80_nrc <- nrc_profile(song_tidy80)
radar_chart_80s <- nrc_radar(song80_nrc, "1980s Sentiment")
radar_chart_80s

song90_nrc <- nrc_profile(song_tidy90)
radar_chart_90s <- nrc_radar(song90_nrc, "1990s Sentiment")
radar_chart_90s

song00_nrc <- nrc_profile(song_tidy00)
radar_chart_00s <- nrc_radar(song00_nrc, "2000s Sentiment")
radar_chart_00s

song10_nrc <- nrc_profile(song_tidy10)
radar_chart_10s <- nrc_radar(song10_nrc, "2010s Sentiment")
radar_chart_10s
# Shared palette for the lyric plots.
my_colors <- c("#E69F00", "#56B4E9", "#009E73", "#CC79A7", "#D55E00")

# Minimal shared ggplot theme: centered title, no x-axis text, no ticks,
# no grid lines, no legend.
theme_lyrics <- function() {
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_blank(),
    axis.ticks = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    legend.position = "none"
  )
}
# Lexical diversity: number of distinct words per song, grouped by year.
lex_diversity_per_year <- songs %>%
  unnest_tokens(word, Lyrics) %>%
  group_by(Song.Title, Year) %>%
  summarise(lex_diversity = n_distinct(word)) %>%
  arrange(desc(lex_diversity))

# Per-song diversity over time with a linear trend (black) and a smoothed
# trend (blue) overlaid.
diversity_plot <- lex_diversity_per_year %>%
  ggplot(aes(Year, lex_diversity)) +
  geom_point(color = my_colors[3], alpha = .3, size = 1,
             position = "jitter") +
  stat_smooth(color = "black", se = FALSE, method = "lm") +
  geom_smooth(aes(x = Year, y = lex_diversity), se = FALSE,
              color = "blue", lwd = 2) +
  ggtitle("Lexical Diversity") +
  xlab("Year") +
  ylab("") +
  scale_color_manual(values = my_colors) +
  theme_classic() +
  theme_lyrics()
diversity_plot
# Lexical density: distinct words as a fraction of total words per song.
lex_density_per_year <- songs %>%
  unnest_tokens(word, Lyrics) %>%
  group_by(Song.Title, Year) %>%
  summarise(lex_density = n_distinct(word) / n()) %>%
  arrange(desc(lex_density))

# Per-song density over time with linear (black) and smoothed (blue) trends.
density_plot <- lex_density_per_year %>%
  ggplot(aes(Year, lex_density)) +
  geom_point(color = my_colors[4], alpha = .4, size = 2,
             position = "jitter") +
  stat_smooth(color = "black", se = FALSE, method = "lm") +
  geom_smooth(aes(x = Year, y = lex_density), se = FALSE,
              color = "blue", lwd = 2) +
  ggtitle("Lexical Density") +
  xlab("Year") +
  ylab("") +
  scale_color_manual(values = my_colors) +
  theme_classic() +
  theme_lyrics()
density_plot
library(tidytext)
library(RColorBrewer)
# TF-IDF of lyric words (length > 3), treating each Decade as a document.
tfidf_words_decade <- songs %>%
  unnest_tokens(word, Lyrics) %>%
  distinct() %>%
  filter(nchar(word) > 3) %>%
  count(Decade, word, sort = TRUE) %>%
  bind_tf_idf(word, Decade, n) %>%
  arrange(desc(tf_idf))

# Top 8 distinctive words per decade; `row` provides a unique x position
# so each facet can relabel its own bars.
top_tfidf_words_decade <- tfidf_words_decade %>%
  group_by(Decade) %>%
  slice(seq_len(8)) %>%
  ungroup() %>%
  arrange(Decade, tf_idf) %>%
  mutate(row = row_number())

# BUG FIX: show.legend = NULL raised "`show.legend` must be a logical
# vector"; FALSE expresses the intended "no legend".
top_tfidf_words_decade %>%
  ggplot(aes(x = row, tf_idf, fill = Decade)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "TF-IDF") +
  ggtitle("Important Words using TF-IDF by Decade") +
  theme_lyrics() +
  facet_wrap(~Decade, ncol = 3, nrow = 2, scales = "free") +
  scale_x_continuous(breaks = top_tfidf_words_decade$row,
                     labels = top_tfidf_words_decade$word) +
  coord_flip()
# Artists charting in the most distinct years ("timeless" artists).
timeless_artists <- songs %>%
  select(Artist, Year) %>%
  group_by(Year) %>%
  distinct(Artist) %>%
  ungroup() %>%
  group_by(Artist) %>%
  count() %>%
  arrange(desc(n)) %>%
  head(25)

# Tag each song as by one of the 25 hand-picked recurring artists
# ("Top Artist") or not ("Total"). %in% against a vector replaces the
# original 25-way `==`/`|` chain.
top_artist_names <- c(
  "Madonna", "Elton John", "Mariah Carey", "Taylor Swift", "Chicago",
  "Stevie Wonder", "Kelly Clarkson", "Michael Jackson", "Rihanna",
  "Aerosmith", "Daryl Hall and John Oates", "Drake", "Janet Jackson",
  "Beyonce", "Bon Jovi", "Celine Dion", "Cher", "Commodores", "Eminem",
  "Eric Clapton", "Justin Timberlake", "Maroon 5", "Olivia Newton-John",
  "Rod Stewart", "Usher"
)
topartist_token <- songs %>%
  mutate(timeless = ifelse(Artist %in% top_artist_names, "Top Artist", "Total"))

# TF-IDF with the Top Artist / Total split as the document unit.
tfidf_words_timeless <- topartist_token %>%
  unnest_tokens(word, Lyrics) %>%
  distinct() %>%
  filter(nchar(word) > 3) %>%
  count(timeless, word, sort = TRUE) %>%
  bind_tf_idf(word, timeless, n) %>%
  arrange(desc(tf_idf))

top_tfidf_words_timeless <- tfidf_words_timeless %>%
  group_by(timeless) %>%
  slice(seq_len(10)) %>%
  ungroup() %>%
  arrange(timeless, tf_idf) %>%
  mutate(row = row_number())

# BUG FIXES: show.legend = FALSE (NULL raised a warning) and the title no
# longer says "by Decade" (copy-paste error from the decade plot).
top_tfidf_words_timeless %>%
  ggplot(aes(x = row, tf_idf, fill = timeless)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "TF-IDF") +
  ggtitle("Important Words using TF-IDF: Top Artists vs Total") +
  theme_lyrics() +
  facet_wrap(~timeless, ncol = 3, nrow = 2, scales = "free") +
  scale_x_continuous(breaks = top_tfidf_words_timeless$row,
                     labels = top_tfidf_words_timeless$word) +
  coord_flip()
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(wordcloud)
##
## Attaching package: 'wordcloud'
## The following object is masked from 'package:gplots':
##
## textplot
# Positive/negative comparison clouds per decade. Factored the repeated
# pipeline into a helper; explicit by = "word" silences the join message.
sentiment_cloud <- function(tokens) {
  tokens %>%
    inner_join(bing, by = "word") %>%
    count(word, sentiment, sort = TRUE) %>%
    acast(word ~ sentiment, value.var = "n", fill = 0) %>%
    comparison.cloud(colors = c("gray20", "gray80"), max.words = 100)
}

song_compcloud70 <- sentiment_cloud(songtoken70)
# BUG FIX: the 80s-10s results previously overwrote song_tidy80/90/00/10
# (the frequency tables built earlier) with plot side-effect return
# values; renamed to song_compcloud* for consistency with the 70s line.
song_compcloud80 <- sentiment_cloud(songtoken80)
song_compcloud90 <- sentiment_cloud(songtoken90)
song_compcloud00 <- sentiment_cloud(songtoken00)
song_compcloud10 <- sentiment_cloud(songtoken10)
# Bigrams from the cleaned Corpus column.
song_bigrams <- songs %>%
  unnest_tokens(bigram, Corpus, token = "ngrams", n = 2)
bigrams_separated <- song_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")
# Drop stop words and very short words from either position.
bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word,
         nchar(word1) > 2,
         nchar(word2) > 2)
# Top 10 bigrams per decade (duplicated-word bigrams removed); `row`
# provides a unique x position per bar for the faceted plot.
bigram_decade <- bigrams_filtered %>%
  filter(word1 != word2) %>%
  unite(bigram, word1, word2, sep = " ") %>%
  count(bigram, Decade, sort = TRUE) %>%
  group_by(Decade) %>%
  slice(seq_len(10)) %>%
  ungroup() %>%
  arrange(Decade, n) %>%
  mutate(row = row_number())
ggplot(bigram_decade, aes(x = row, y = n, fill = Decade)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~Decade, scales = "free_y") +
  xlab(NULL) +
  ylab(NULL) +
  scale_x_continuous(breaks = bigram_decade$row,
                     labels = bigram_decade$bigram) +
  theme(panel.grid.major.x = element_blank()) +
  ggtitle("Bigrams Per Decade") +
  coord_flip()
library(ggraph)
# Network graph of the per-decade top bigrams (edges word1 -> word2).
# NOTE(review): ggraph() is handed a plain data frame here; it normally
# expects an igraph/tbl_graph object (e.g. via igraph::graph_from_data_frame()).
# This relies on tidygraph's edge-list coercion — confirm it actually renders,
# and that the `name` node attribute exists for geom_node_text().
bigram_decade %>% select(bigram, n) %>%
separate(bigram, c("word1", "word2")) %>%
ggraph(layout = "fr") +
geom_edge_link() +
geom_node_point() +
geom_node_text(aes(label = name), vjust = 1, hjust = 1)
# One row per (song, word) for each part of speech, tagged with its
# universal POS label. The no-op mutate(Year = Year) calls were removed.
song_noun <- songs %>%
  dplyr::select(Song.Title, Nouns, Year) %>%
  unnest_tokens(output = word, input = Nouns) %>%
  mutate(Upos = "NOUN")
song_verb <- songs %>%
  dplyr::select(Song.Title, Verbs, Year) %>%
  unnest_tokens(output = word, input = Verbs) %>%
  mutate(Upos = "VERB")
song_adverb <- songs %>%
  dplyr::select(Song.Title, Adverbs, Year) %>%
  unnest_tokens(output = word, input = Adverbs) %>%
  mutate(Upos = "ADV")
# Combined POS-tagged token table (bind_rows replaces nested rbind calls).
song_upos <- bind_rows(song_noun, song_verb, song_adverb)
# Hand-labeled word groups taken from the LDA topics fitted below.
t1 <- c("woah", "kiss", "life", "thing", "world", "song", "people")
t2 <- c("tonight", "dance", "party", "hand", "thing")
t3 <- c("friend", "head", "woman", "lover", "lady", "ladies")
t4 <- c("shit", "bitch", "bottom", "water", "pussi", "pussy")
t5 <- c("money", "bodies", "type", "bitch", "cash", "body")
t6 <- c("name", "chance", "town", "rain", "tear")
t7 <- c("gang", "taste", "girlfriend", "wrist", "chain")
t8 <- c("thunder", "star", "murder", "ghost", "wish")

# Plot the yearly frequency of a topic's words as a smoothed trend line.
#   topic: character vector of topic words
#   color: line color
#   title: plot title
topic_modeling <- function(topic, color, title) {
  song_upos %>%
    filter(word %in% topic) %>%
    group_by(Year) %>%
    # summarise() replaces the original mutate(n())/select()/distinct()
    # combination; both yield one (Year, topic_count) row per year.
    summarise(topic_count = n()) %>%
    ggplot(aes(Year, topic_count)) +
    geom_smooth(se = FALSE, col = color) +
    ggtitle(title)
}

# Pool of line colors sampled for the topic plots.
colors <- c("#89C5DA", "#DA5724", "#74D944", "#CE50CA", "#3F4921", "#C0717C", "#CBD588", "#5F7FC7",
            "#673770", "#D3D93E", "#38333E", "#508578", "#D7C1B1", "#689030", "#AD6F3B", "#CD9BCD",
            "#D14285", "#6DDE88", "#652926", "#7FDCC0", "#C84248", "#8569D5", "#5E738F", "#D1A33D",
            "#8A7C64", "#599861")
# Draw each topic's trend with a randomly sampled line color. The calls
# are kept in the original order so sample() consumes the RNG stream
# identically; print() makes the ggplot render inside the loop.
topic_sets <- list(
  list(words = t1, title = "Passion/Longing"),
  list(words = t2, title = "Party/Dance/Movement"),
  list(words = t3, title = "Intimacy/Women"),
  list(words = t4, title = "Sex/Objectification"),
  list(words = t5, title = "Measurements of Clout"),
  list(words = t6, title = "Impulsivity/Sadness"),
  list(words = t7, title = "Commitment/Tether"),
  list(words = t8, title = "Faith/Violence")
)
for (ts in topic_sets) {
  print(topic_modeling(ts$words, sample(colors, 1), ts$title))
}
library(reticulate)
import pandas as pd
import numpy as np
import requests
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import HTML, display
from bs4 import BeautifulSoup
from nltk.tokenize import RegexpTokenizer
import lyricsgenius as genius
import sys
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from nltk.stem import PorterStemmer
from datetime import datetime
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import spacy
from collections import Counter
from os import path
from PIL import Image
from keras.models import model_from_json
import pickle
import json
def collect_songs_from_billboard(start_year, end_year):
    '''Collect Billboard year-end Hot 100 chart entries for a range of years.

    Years 2013+ are scraped from billboard.com; earlier years come from
    billboardtop100of.com (billboard.com lacks complete pre-2013 results).

    Parameters:
        start_year (int): first year to collect (inclusive).
        end_year (int): last year to collect (inclusive).

    Returns:
        pandas.DataFrame with Rank, Song Title, Artist, and Year columns.
    '''
    years = np.arange(start_year, end_year + 1).astype(int)
    # Years served by billboard.com's own year-end pages.
    billboard_years = np.arange(2013, 2020)
    # Accumulate row dicts and build the DataFrame once at the end
    # (DataFrame.append in a loop is deprecated and O(n^2)).
    rows = []
    for year in years:
        sys.stdout.write("\r" + "Collecting Songs from " + str(year) +
                         " via https://www.billboard.com")
        sys.stdout.flush()
        if year in billboard_years:
            url = "https://www.billboard.com/charts/year-end/" + str(year) + "/hot-100-songs"
            page = requests.get(url)
            soup = BeautifulSoup(page.content, "html.parser")
            all_ranks = soup.find_all("div", class_="ye-chart-item__rank")
            all_titles = soup.find_all('div', class_="ye-chart-item__title")
            all_artists = soup.find_all("div", class_="ye-chart-item__artist")
            for rank, title, artist in zip(all_ranks, all_titles, all_artists):
                rows.append({
                    "Rank": rank.get_text(strip=True),
                    "Song Title": title.get_text(strip=True),
                    "Artist": artist.get_text(strip=True),
                    "Year": year,
                })
        else:
            url = "http://billboardtop100of.com/" + str(year) + "-2/"
            page = requests.get(url)
            soup = BeautifulSoup(page.content, "html.parser")
            for table_row in soup.find_all('tr'):
                columns = table_row.find_all('td')
                rows.append({
                    "Rank": columns[0].get_text(strip=True),
                    "Artist": columns[1].get_text(strip=True),
                    "Song Title": columns[2].get_text(strip=True),
                    "Year": year,
                })
    dataset = pd.DataFrame(rows)
    dataset['Year'] = dataset['Year'].astype(int)
    return dataset
def add_spacy_data(dataset, feature_column):
    '''
    Enrich `dataset` with space-joined verb/noun/adverb lemma strings and a
    stop-word-free corpus derived from `feature_column` via spaCy POS tags.

    Parameters:
        dataset (dataframe): the dataframe to parse.
        feature_column (string): the column to parse in the dataset.

    Returns:
        The same dataframe with Verbs, Nouns, Adverbs and Corpus columns added.
    '''
    verbs = []
    nouns = []
    adverbs = []
    corpus = []
    nlp = spacy.load('en_core_web_sm')
    for i in range(len(dataset)):
        print("Extracting verbs and topics from record {} of {}".format(i + 1, len(dataset)), end="\r")
        song = dataset.iloc[i][feature_column]
        doc = nlp(song)
        # Collect per-token (lemma, pos, is_stop) in plain lists — the
        # original built a one-row-at-a-time DataFrame via the deprecated,
        # quadratic DataFrame.append.
        lemmas = []
        pos_tags = []
        stop_flags = []
        for token in doc:
            # Keep pronoun surface forms (spaCy lemmatizes them to "-PRON-").
            if token.lemma_ == "-PRON-":
                lemmas.append(token.text)
            else:
                lemmas.append(token.lemma_)
            pos_tags.append(token.pos_)
            stop_flags.append(token.is_stop)
        verbs.append(" ".join(l for l, p in zip(lemmas, pos_tags) if p == "VERB"))
        nouns.append(" ".join(l for l, p in zip(lemmas, pos_tags) if p == "NOUN"))
        adverbs.append(" ".join(l for l, p in zip(lemmas, pos_tags) if p == "ADV"))
        corpus_clean = " ".join(l for l, s in zip(lemmas, stop_flags) if not s)
        # Collapse runs of non-alphanumeric characters to single spaces.
        corpus_clean = re.sub(r'[^A-Za-z0-9]+', ' ', corpus_clean)
        corpus.append(corpus_clean)
    dataset['Verbs'] = verbs
    dataset['Nouns'] = nouns
    dataset['Adverbs'] = adverbs
    dataset['Corpus'] = corpus
    return dataset
def pre_clean(dataset):
    '''Replace every run of non-alphanumeric characters in the `lyrics`
    column with a single space.

    BUG FIX: the cleaned strings were previously accumulated in a
    module-level `lyric_output` list, so a second call appended to the
    first call's output and failed (or corrupted data) on assignment.
    The buffer is now local to the function.

    Parameters:
        dataset (dataframe): must contain a `lyrics` column of strings.

    Returns:
        The same dataframe with `lyrics` cleaned in place.
    '''
    cleaned = []
    for lyric in dataset['lyrics']:
        cleaned.append(re.sub(r'[^A-Za-z0-9]+', ' ', lyric))
    dataset['lyrics'] = cleaned
    return dataset
def prep_corpus(raw_string):
    '''Single-string version of add_spacy_data, for pipelining data into
    predictions.

    Lemmatizes the input with spaCy (keeping pronoun surface forms, which
    spaCy lemmatizes to "-PRON-"), drops stop words, and collapses runs of
    non-alphanumeric characters to single spaces.

    Parameters:
        raw_string (string): text to be parsed.

    Returns:
        Cleaned, lemmatized string with stop words removed.

    Note: the original also accumulated verb/noun/adverb lists that were
    never returned; that dead code has been removed.
    '''
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(raw_string)
    lemmas = [
        token.text if token.lemma_ == "-PRON-" else token.lemma_
        for token in doc
        if not token.is_stop
    ]
    corpus_clean = " ".join(lemmas)
    return re.sub(r'[^A-Za-z0-9]+', ' ', corpus_clean)
# --- Collect lyrics and metadata for every charted song via Genius ---
all_songs = collect_songs_from_billboard(1970, 2019)
# Normalize artist names to match Genius. .loc replaces the original
# chained assignment (all_songs["Artist"][mask] = ...), which pandas
# flags as unreliable.
all_songs.loc[all_songs['Artist'] == "Jackson 5", "Artist"] = "The Jackson 5"
all_songs.loc[all_songs['Artist'] == "Beatles", "Artist"] = "The Beatles"
# SECURITY NOTE(review): hard-coded Genius API token committed to source;
# rotate it and load it from an environment variable instead.
api = genius.Genius("Gk7JH9g31J9T2nWV-o82WaGwIZQ_04LgbxcJypt4dBRdaGSH494rBORd2qMIVlzJ", sleep_time=0.01, verbose=False)
# Accumulate row dicts; build the DataFrame once (append-in-loop is
# deprecated and quadratic).
song_rows = []
start_time = datetime.now()
print("Started at {}".format(start_time))
for i in range(len(all_songs)):
    rolling_pct = int((i / len(all_songs)) * 100)
    print(str(rolling_pct) + "% complete." + " Collecting Record " + str(i) + " of " +
          str(len(all_songs)) + ". Year " + str(all_songs.iloc[i]['Year']) + "." + " Currently collecting " +
          all_songs.iloc[i]['Song Title'] + " by " + all_songs.iloc[i]['Artist'] + " " * 50, end="\r")
    # Genius writes "&" where the charts write "and"; normalize both fields.
    song_title = re.sub(" and ", " & ", all_songs.iloc[i]['Song Title'])
    # Alphanumeric-only lowercase form used to verify the search hit.
    song_title_test = re.sub(r'\W+', '', song_title).lower()
    artist_name = re.sub(" and ", " & ", all_songs.iloc[i]['Artist'])
    # Default every field to the "null" placeholder; overwritten only on a
    # verified match.
    song_album = song_album_url = featured_artists = song_lyrics = "null"
    song_media = song_url = song_writer_artists = song_year = "null"
    try:
        song = api.search_song(song_title, artist=artist_name)
        result_title = re.sub(r'\W+', '', song.title).lower()
        if result_title == song_title_test:
            song_album = song.album
            song_album_url = song.album_url
            featured_artists = song.featured_artists
            song_lyrics = re.sub("\n", " ", song.lyrics)  # newline breaks not needed
            song_media = song.media
            song_url = song.url
            song_writer_artists = song.writer_artists
            song_year = song.year
        else:
            # Mismatched search result: log the pair, keep placeholders.
            print(song_title)
            print(result_title)
    except Exception:
        # Network/parse failure (or no result): keep the "null" placeholders.
        # Narrowed from a bare except so Ctrl-C still interrupts the run.
        pass
    song_rows.append({
        "Year": all_songs.iloc[i]['Year'],
        "Rank": all_songs.iloc[i]['Rank'],
        "Song Title": all_songs.iloc[i]['Song Title'],
        "Artist": all_songs.iloc[i]['Artist'],
        "Album": song_album,
        "Album URL": song_album_url,
        "Featured Artists": featured_artists,
        "Lyrics": song_lyrics,
        "Media": song_media,
        "Song URL": song_url,
        "Writers": song_writer_artists,
        "Release Date": song_year
    })
all_song_data = pd.DataFrame(song_rows)
end_time = datetime.now()
# BUG FIX: the completion message previously printed start_time.
print("\nCompleted at {}".format(end_time))
print("Total time to collect: {}".format(end_time - start_time))
all_song_data.to_csv("all_songs_data.csv")
all_song_data.to_json("all_song_data.json", orient='records')
# Load the saved dataset, drop rows without lyrics, and enrich with spaCy
# features plus word-count columns.
loaded_song_dataset = pd.read_csv("all_songs_data.csv", index_col=0)
songs_with_lyrics_dataset = loaded_song_dataset.dropna(subset=['Lyrics'])
prepared_songs_dataset = add_spacy_data(songs_with_lyrics_dataset, 'Lyrics')
prepared_songs_dataset = prepared_songs_dataset.drop(columns=['Unnamed: 0'])
word_counts = []
unique_word_counts = []
for lyrics in prepared_songs_dataset['Lyrics']:
    words = lyrics.split()
    word_counts.append(len(words))
    unique_word_counts.append(len(set(words)))
prepared_songs_dataset['Word Counts'] = word_counts
prepared_songs_dataset['Unique Word Counts'] = unique_word_counts
# NOTE(review): this re-read discards the dataset just computed above and
# loads a previously saved checkpoint instead — presumably intentional
# (avoids re-running spaCy); confirm the checkpoint is up to date.
prepared_songs_dataset = pd.read_csv('prepped_data.csv', index_col=0)
# Per-year averages of total and unique word counts. Rows are collected in
# a list and converted once (DataFrame.append in a loop is deprecated).
summary_rows = []
for year in prepared_songs_dataset['Year'].unique().tolist():
    mask = prepared_songs_dataset['Year'] == year
    summary_rows.append({
        "Year": year,
        "Average Words": prepared_songs_dataset['Word Counts'][mask].mean(),
        "Unique Words": prepared_songs_dataset['Unique Word Counts'][mask].mean()
    })
summary_dataset = pd.DataFrame(summary_rows)
summary_dataset["Year"] = summary_dataset['Year'].astype(int)
# Non-null record counts per year.
characteristics = prepared_songs_dataset.groupby('Year').count()
import pandas as pd
import numpy as np
import requests
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import HTML, display
from bs4 import BeautifulSoup
from nltk.tokenize import RegexpTokenizer
import lyricsgenius as genius
import sys
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from nltk.stem import PorterStemmer
from datetime import datetime
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import spacy
from collections import Counter
from os import path
from PIL import Image
import pickle
import json
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
# --- LDA topic modeling over the cleaned song corpora ---
final_songs = pd.read_csv("final_songs.csv", index_col=0)
tokenizer = RegexpTokenizer(r'\w+')
# Generic English stop word list.
en_stop = get_stop_words('en')
# Porter stemmer for token normalization.
p_stemmer = PorterStemmer()
doc_set = final_songs['Corpus'].tolist()
# Domain-specific stop words: song-structure labels and overly common terms.
stops = ["love", "time", "day", "night", "girl", "baby", "babi", "like", "chorus", "verse", "bridge", "yeah", "whoa", "because", "come", "nigga", "thing"]
texts = []
for doc in doc_set:
    # BUG FIX: the short-word strip originally ran on the raw string (`i`),
    # silently discarding the lowercasing; it now runs on the lowered text.
    raw = doc.lower()
    raw = re.sub(r'\b\w{1,3}\b', '', raw)
    tokens = tokenizer.tokenize(raw)
    # BUG FIX: the second filter originally re-filtered `tokens`, silently
    # discarding the en_stop removal; the filters now apply in sequence.
    stopped_tokens = [t for t in tokens if t not in en_stop]
    stopped_tokens = [t for t in stopped_tokens if t not in stops]
    # Stem, then drop any stems that land back in the domain stop list.
    stemmed_tokens = [p_stemmer.stem(t) for t in stopped_tokens]
    stemmed_tokens = [t for t in stemmed_tokens if t not in stops]
    texts.append(stemmed_tokens)
# id <-> term dictionary over all documents.
dictionary = corpora.Dictionary(texts)
# Bag-of-words document-term matrix.
corpus = [dictionary.doc2bow(text) for text in texts]
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=9, id2word=dictionary, passes=20)
# Topics saved from a previous LDA run (kept for reference, since LDA is
# non-deterministic across runs).
lda_results = [(0, '0.136*"heart" + 0.025*"dream" + 0.021*"fire" + 0.018*"woah" + 0.017*"kiss"'), (1, '0.095*"life" + 0.074*"thing" + 0.049*"world" + 0.027*"song" + 0.019*"people"'), (2, '0.104*"tonight" + 0.042*"dance" + 0.037*"party" + 0.024*"hand" + 0.021*"thing"'), (3, '0.058*"friend" + 0.050*"head" + 0.040*"woman" + 0.034*"lover" + 0.029*"lady", ladies'), (4, '0.095*"shit" + 0.086*"bitch" + 0.016*"bottom" + 0.015*"water" + 0.014*"pussies + 0.014"'), (5, '0.098*"money" + 0.067*"bodies" + 0.022*"type" + 0.016*"bitch" + 0.015*"cash"'), (6, '0.037*"name" + 0.025*"chance" + 0.023*"town" + 0.019*"rain" + 0.017*"tear"'), (7, '0.066*"gang" + 0.056*"taste" + 0.039*"girlfriend" + 0.028*"wrist" + 0.019*"chain"'), (8, '0.109*"thunder" + 0.040*"star" + 0.033*"murder" + 0.022*"ghost" + 0.020*"wish"')]